import subprocess

# Launcher: starts a vLLM OpenAI-compatible API server and a Chainlit UI as
# child processes, blocks until they exit, and shuts both down cleanly.

# HF model identifier served by the vLLM OpenAI-compatible API server.
model_name = "hf-models/glm-4-9b-chat"

api_server_command = [
    "python",
    "-m",
    "vllm.entrypoints.openai.api_server",
    "--model",
    model_name,
    "--dtype",
    "float16",
    "--api-key",
    "",  # NOTE(review): empty API key disables auth on the endpoint — confirm intended
    "--tensor-parallel-size",
    "4",
    "--trust-remote-code",
    "--gpu-memory-utilization",
    "0.8",
    "--disable-log-requests",
    "--disable-log-stats",
    "--port",
    "8000",
    # When running multiple models across GPUs, vLLM may report "GPU blocks: 0";
    # see https://github.com/vllm-project/vllm/issues/2248
    # "--enforce-eager"
]

# Announce before spawning, so the message precedes the server's own output.
print("开始启动 api 服务")
api_process = subprocess.Popen(api_server_command, text=True)

# Child process 2: the Chainlit front-end UI.
chainlit_ui_process = subprocess.Popen(
    ["python", "-m", "chainlit", "run", "chainlit_ui.py",
     "--host", "0.0.0.0", "--port", "7860", "--ci", "--headless"])

processes = [api_process, chainlit_ui_process]
try:
    # Block until both children exit on their own.
    for proc in processes:
        proc.wait()
except KeyboardInterrupt:
    # Ctrl-C: fall through to the graceful shutdown in `finally`.
    print("Shutting down servers.")
finally:
    # Ask each child to exit gracefully (SIGTERM) before resorting to SIGKILL,
    # giving vLLM a chance to release GPU memory and ports.
    for proc in processes:
        proc.terminate()
    for proc in processes:
        try:
            proc.wait(timeout=10)
        except subprocess.TimeoutExpired:
            proc.kill()
            proc.wait()
    print("Servers shut down.")